Projet Airbnb Ilan ZINI et Mark Killian ZINENBERG (groupe L)¶
Bibliothèques¶
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import xgboost as xgb
Exploration des données du airbnb_train et du airbnb_test¶
On importe les données de 2 fichiers, le fichier d'entrainement et le fichier de test.
train_data = pd.read_csv('airbnb_train.csv')
test_data = pd.read_csv('airbnb_test.csv')
A quoi ressemble les fichiers de données ?
print("Ficher d'entraînement:")
train_data.head()
Ficher d'entraînement:
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | ... | last_review | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5708593 | 4.317488 | House | Private room | {TV,"Wireless Internet",Kitchen,"Free parking ... | 3 | 1.0 | Real Bed | flexible | False | ... | NaN | 33.782712 | -118.134410 | Island style Spa Studio | Long Beach | 0 | NaN | 90804 | 0.0 | 2.0 |
| 1 | 14483613 | 4.007333 | House | Private room | {"Wireless Internet","Air conditioning",Kitche... | 4 | 2.0 | Real Bed | strict | False | ... | 2017-09-17 | 40.705468 | -73.909439 | Beautiful and Simple Room W/2 Beds, 25 Mins to... | Ridgewood | 38 | 86.0 | 11385 | 1.0 | 2.0 |
| 2 | 10412649 | 7.090077 | Apartment | Entire home/apt | {TV,"Wireless Internet","Air conditioning",Kit... | 6 | 2.0 | Real Bed | flexible | False | ... | NaN | 38.917537 | -77.031651 | 2br/2ba luxury condo perfect for infant / toddler | U Street Corridor | 0 | NaN | 20009 | 2.0 | 2.0 |
| 3 | 17954362 | 3.555348 | House | Private room | {TV,"Cable TV",Internet,"Wireless Internet","A... | 1 | 1.0 | Real Bed | flexible | True | ... | 2017-09-29 | 40.736001 | -73.924248 | Manhattan view from Queens. Lovely single room . | Sunnyside | 19 | 96.0 | 11104 | 1.0 | 1.0 |
| 4 | 9969781 | 5.480639 | House | Entire home/apt | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | Real Bed | moderate | True | ... | 2017-08-28 | 37.744896 | -122.430665 | Zen Captured Noe Valley House | Noe Valley | 15 | 96.0 | 94131 | 2.0 | 2.0 |
5 rows × 28 columns
print("Ficher de test:")
test_data.head()
Ficher de test:
| Unnamed: 0 | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | city | ... | last_review | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14282777 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 3 | 1.0 | Real Bed | strict | True | NYC | ... | 2016-07-18 | 40.696524 | -73.991617 | Beautiful brownstone 1-bedroom | Brooklyn Heights | 2 | 100.0 | 11201 | 1.0 | 1.0 |
| 1 | 17029381 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 7 | 1.0 | Real Bed | strict | True | NYC | ... | 2017-09-23 | 40.766115 | -73.989040 | Superb 3BR Apt Located Near Times Square | Hell's Kitchen | 6 | 93.0 | 10019 | 3.0 | 3.0 |
| 2 | 7824740 | Apartment | Entire home/apt | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | Real Bed | moderate | True | NYC | ... | 2017-09-14 | 40.808110 | -73.943756 | The Garden Oasis | Harlem | 10 | 92.0 | 10027 | 1.0 | 3.0 |
| 3 | 19811650 | House | Entire home/apt | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | Real Bed | flexible | True | SF | ... | NaN | 37.772004 | -122.431619 | Beautiful Flat in the Heart of SF! | Lower Haight | 0 | NaN | 94117.0 | 2.0 | 2.0 |
| 4 | 12410741 | Apartment | Entire home/apt | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.0 | Real Bed | moderate | True | DC | ... | 2017-01-22 | 38.925627 | -77.034596 | Great studio in midtown DC | Columbia Heights | 4 | 40.0 | 20009 | 0.0 | 1.0 |
5 rows × 27 columns
Nous regardons des infos sur nos données :
train_data.columns
Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
'cleaning_fee', 'city', 'description', 'first_review',
'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
'host_since', 'instant_bookable', 'last_review', 'latitude',
'longitude', 'name', 'neighbourhood', 'number_of_reviews',
'review_scores_rating', 'zipcode', 'bedrooms', 'beds'],
dtype='object')
train_data.shape
(22234, 28)
test_data.columns
Index(['Unnamed: 0', 'property_type', 'room_type', 'amenities', 'accommodates',
'bathrooms', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city',
'description', 'first_review', 'host_has_profile_pic',
'host_identity_verified', 'host_response_rate', 'host_since',
'instant_bookable', 'last_review', 'latitude', 'longitude', 'name',
'neighbourhood', 'number_of_reviews', 'review_scores_rating', 'zipcode',
'bedrooms', 'beds'],
dtype='object')
test_data.shape
(51877, 27)
Il y a une colonne en moins dans le fichier de test, c'est normal, il n'y a pas la colonne du logarithme du prix, on doit la prédire.
train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22234 entries, 0 to 22233 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 22234 non-null int64 1 log_price 22234 non-null float64 2 property_type 22234 non-null object 3 room_type 22234 non-null object 4 amenities 22234 non-null object 5 accommodates 22234 non-null int64 6 bathrooms 22183 non-null float64 7 bed_type 22234 non-null object 8 cancellation_policy 22234 non-null object 9 cleaning_fee 22234 non-null bool 10 city 22234 non-null object 11 description 22234 non-null object 12 first_review 17509 non-null object 13 host_has_profile_pic 22178 non-null object 14 host_identity_verified 22178 non-null object 15 host_response_rate 16759 non-null object 16 host_since 22178 non-null object 17 instant_bookable 22234 non-null object 18 last_review 17518 non-null object 19 latitude 22234 non-null float64 20 longitude 22234 non-null float64 21 name 22234 non-null object 22 neighbourhood 20148 non-null object 23 number_of_reviews 22234 non-null int64 24 review_scores_rating 17256 non-null float64 25 zipcode 21931 non-null object 26 bedrooms 22208 non-null float64 27 beds 22199 non-null float64 dtypes: bool(1), float64(7), int64(3), object(17) memory usage: 4.6+ MB
train_data.describe()
| id | log_price | accommodates | bathrooms | latitude | longitude | number_of_reviews | review_scores_rating | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.223400e+04 | 22234.000000 | 22234.000000 | 22183.000000 | 22234.000000 | 22234.000000 | 22234.000000 | 17256.000000 | 22208.000000 | 22199.000000 |
| mean | 1.122269e+07 | 4.783481 | 3.155573 | 1.236037 | 38.462971 | -92.269305 | 20.670774 | 94.069077 | 1.264769 | 1.711473 |
| std | 6.080480e+06 | 0.718758 | 2.143870 | 0.586246 | 3.071679 | 21.670081 | 37.183731 | 7.782235 | 0.852819 | 1.254903 |
| min | 3.362000e+03 | 2.302585 | 1.000000 | 0.000000 | 33.339002 | -122.510940 | 0.000000 | 20.000000 | 0.000000 | 0.000000 |
| 25% | 6.202924e+06 | 4.317488 | 2.000000 | 1.000000 | 34.136082 | -118.340633 | 1.000000 | 92.000000 | 1.000000 | 1.000000 |
| 50% | 1.217425e+07 | 4.700480 | 2.000000 | 1.000000 | 40.662632 | -76.994944 | 6.000000 | 96.000000 | 1.000000 | 1.000000 |
| 75% | 1.639502e+07 | 5.220356 | 4.000000 | 1.000000 | 40.746358 | -73.954599 | 23.000000 | 100.000000 | 1.000000 | 2.000000 |
| max | 2.120450e+07 | 7.600402 | 16.000000 | 8.000000 | 42.390248 | -70.989359 | 505.000000 | 100.000000 | 10.000000 | 18.000000 |
Il y a t-il des valeurs manquantes dans le fichier test ?
print(str('Any missing data or NaN in the dataset:'),test_data.isnull().values.any())
Any missing data or NaN in the dataset: True
Il y a effectivement des valeurs manquantes dans le fichier test. Regardons combien il y en a de manquantes par caractéristique.
missing_value_test = test_data.isna()
print("Pour chaque caractéristique du fichier test, il manque tant de valeurs: ")
print(missing_value_test.sum())
Pour chaque caractéristique du fichier test, il manque tant de valeurs: Unnamed: 0 0 property_type 0 room_type 0 amenities 0 accommodates 0 bathrooms 149 bed_type 0 cancellation_policy 0 cleaning_fee 0 city 0 description 0 first_review 11139 host_has_profile_pic 132 host_identity_verified 132 host_response_rate 12824 host_since 132 instant_bookable 0 last_review 11111 latitude 0 longitude 0 name 0 neighbourhood 4786 number_of_reviews 0 review_scores_rating 11744 zipcode 663 bedrooms 65 beds 96 dtype: int64
Nous remarquons qu'il n'y a aucune valeur manquante pour certaines caractéristiques et au contraire, un certain nombre de valeurs manquantes pour d'autres. Dans ce qui suit (préparation des données), nous allons faire en sorte de ne garder aucune valeur manquante dans nos fichers d'entrainement et de test.
Préparation des données¶
Beaucoup (trop) de valeurs sont manquantes dans les colonnes: first_review, last_review, review_scores_rating et host_since. Nous supprimons donc ces colonnes du fichier d'entrainement.
Nous supprimons également les colonnes name et description car nous estimons qu'elles sont inutiles dans le cadre de ce projet de prédiction.
train_data.drop(columns=['first_review', 'last_review', 'review_scores_rating','host_since', 'name', 'description'], inplace=True)
Dans les colonnes contenant un nombre "raisonnable" de valeurs manquantes, nous choisissons de remplacer les valeurs manquantes par la médiane afin d'éviter des erreurs de logique (0 chambres n'a pas de sens) et afin de réduire la sensibilité aux valeurs extrêmes. Cela concerne les caractéristiques 'bathrooms', 'bedrooms' et 'beds'.
train_data['bathrooms'].fillna(train_data['bathrooms'].median(), inplace=True)
train_data['bedrooms'].fillna(train_data['bedrooms'].median(), inplace=True)
train_data['beds'].fillna(train_data['beds'].median(), inplace=True)
Pour les colonnes de type booléennes / catégorielles, nous remplaçons leurs valeurs manquantes par la valeur la plus fréquente dans la colonne. Cela concerne les caractéristiques 'host_has_profile_pic', 'host_identity_verified', 'host_response_rate', 'zipcode' et 'neighbourhood'.
train_data['host_has_profile_pic'].fillna(train_data['host_has_profile_pic'].mode()[0], inplace=True)
train_data['host_identity_verified'].fillna(train_data['host_identity_verified'].mode()[0], inplace=True)
train_data['host_response_rate'].fillna(train_data['host_response_rate'].mode()[0], inplace=True)
train_data['zipcode'].fillna(train_data['zipcode'].mode()[0], inplace=True)
train_data['neighbourhood'].fillna(train_data['neighbourhood'].mode()[0], inplace=True)
Manque-t-il encore des valeurs dans nos données d'entrainement ?
print(str('Any missing data or NaN in the dataset:'),train_data.isnull().values.any())
Any missing data or NaN in the dataset: False
Non, il ne manque plus de valeurs.
Exécution du même process pour les données de test :
test_data.drop(columns=['first_review', 'last_review', 'review_scores_rating','host_since', 'name', 'description'], inplace=True)
test_data['bathrooms'].fillna(train_data['bathrooms'].median(), inplace=True)
test_data['bedrooms'].fillna(train_data['bedrooms'].median(), inplace=True)
test_data['beds'].fillna(train_data['beds'].median(), inplace=True)
test_data['host_has_profile_pic'].fillna(train_data['host_has_profile_pic'].mode()[0], inplace=True)
test_data['host_identity_verified'].fillna(train_data['host_identity_verified'].mode()[0], inplace=True)
test_data['host_response_rate'].fillna(train_data['host_response_rate'].mode()[0], inplace=True)
test_data['zipcode'].fillna(train_data['zipcode'].mode()[0], inplace=True)
test_data['neighbourhood'].fillna(train_data['neighbourhood'].mode()[0], inplace=True)
print("Pour chaque caractéristique du fichier test, il manque tant de valeurs: ")
print(test_data.isnull().sum())
Pour chaque caractéristique du fichier test, il manque tant de valeurs: Unnamed: 0 0 property_type 0 room_type 0 amenities 0 accommodates 0 bathrooms 0 bed_type 0 cancellation_policy 0 cleaning_fee 0 city 0 host_has_profile_pic 0 host_identity_verified 0 host_response_rate 0 instant_bookable 0 latitude 0 longitude 0 neighbourhood 0 number_of_reviews 0 zipcode 0 bedrooms 0 beds 0 dtype: int64
A présent, transformons les colonnes catégorielles en type numérique:
class CustomTransformation():
def __init__(self):
"""
Class simple pour convertir les type de propriétés en des indices numériques, utilisable pour un algo de machine learning
"""
self.fitted = False # Indique si fit_transform a été utilisé, pour éviter d’utiliser transform sans que fit ai été appelé
self.property2index = dict() # Dictionnaire qui va convertir le nom en indice
self.max_index = 0 # Indique le dernier indice de la propriété.
def fit_transform(self, dataset, colonne):
self.fitted = True
# Récupère les types de colonne
types = dataset[colonne].unique()
print("Tous les types de", colonne, ":\n", types)
print("")
self.property2index = {prop:i for (i, prop) in enumerate(types)}
self.max_index = max(list(self.property2index.values()))
# transform
return self.transform(dataset, colonne)
def transform(self, dataset, colonne):
indice_inconnu = self.max_index + 1
dataset.loc[:, colonne] = dataset[colonne].replace(self.property2index) # Transforme les colonnes en indice
dataset.loc[dataset[colonne].map(type).eq(str), colonne] = indice_inconnu # pour remplacer les lignes qui ont des valeurs qui n’étaient pas dans l’entrainement
return dataset
Transformation des caractéristiques suivantes :
- property_type
- room_type
- bed_type
- cancellation_policy
- city
- neighbourhood
- host_has_profile_pic
- host_identity_verified
- instant_bookable
- zipcode
features_transformer = CustomTransformation()
columns = ["property_type", "room_type", "bed_type", "cancellation_policy", "city", "neighbourhood",
"host_has_profile_pic", "host_identity_verified", "instant_bookable","zipcode"]
Pour le fichier d'entrainement:
for col in columns:
train_data_numerized = features_transformer.fit_transform(train_data, col)
train_data_numerized.head()
Tous les types de property_type : ['House' 'Apartment' 'Townhouse' 'Guest suite' 'Condominium' 'Timeshare' 'Chalet' 'Guesthouse' 'Bungalow' 'Loft' 'In-law' 'Boat' 'Dorm' 'Other' 'Bed & Breakfast' 'Camper/RV' 'Villa' 'Boutique hotel' 'Cabin' 'Hostel' 'Hut' 'Yurt' 'Serviced apartment' 'Castle' 'Vacation home' 'Tent' 'Cave' 'Tipi' 'Earth House' 'Island' 'Treehouse'] Tous les types de room_type : ['Private room' 'Entire home/apt' 'Shared room'] Tous les types de bed_type : ['Real Bed' 'Pull-out Sofa' 'Futon' 'Airbed' 'Couch'] Tous les types de cancellation_policy : ['flexible' 'strict' 'moderate' 'super_strict_30' 'super_strict_60'] Tous les types de city : ['LA' 'NYC' 'DC' 'SF' 'Chicago' 'Boston'] Tous les types de neighbourhood : ['Long Beach' 'Ridgewood' 'U Street Corridor' 'Sunnyside' 'Noe Valley' 'West Village' 'Harlem' 'Flushing' 'Westside' 'Upper West Side' 'Shepherd Park' 'Santa Monica' 'Mission District' 'Murray Hill' 'Williamsburg' 'Chinatown' 'Echo Park' 'Hamilton Heights' 'Mar Vista' 'Encino' 'Kips Bay' 'West Hollywood' 'Carroll Gardens' 'Downtown' 'Bedford-Stuyvesant' 'Wicker Park' "Hell's Kitchen" 'Upper East Side' 'Pasadena' 'Shaw' 'Greenpoint' 'Jackson Heights' 'Clinton Hill' 'Tompkinsville' 'Torrance' 'Beverly Hills' 'Midtown' 'Financial District' 'Fort Greene' 'Pacific Heights' 'Mid-City' 'Chelsea' 'Venice' 'Crown Heights' 'South LA' 'Bushwick' 'Parkchester' 'Glendale' 'Columbia Street Waterfront' 'East Flatbush' 'Western Addition/NOPA' 'Hollywood' 'East Village' 'Lower East Side' 'Nolita' 'East Elmhurst' 'Soho' 'The Rockaways' 'Beacon Hill' 'Forest Hills' 'Chevy Chase' 'Flatbush' 'Lefferts Garden' 'Park Slope' 'East Harlem' 'Compton' 'Glover Park' 'Cahuenga Pass' 'East Hollywood' 'Trinidad' 'Elmhurst' 'Westwood' 'Astoria' 'Southwest Waterfront' 'Near Northeast/H Street Corridor' 'Mount Vernon Square' 'Downtown/Penn Quarter' 'Back Bay' 'Silver Lake' 'Nob Hill' 'Whittier' 'Mid-Wilshire' 'Richmond District' 'Inglewood' 'Granada Hills North' 'Coney Island' 'Russian Hill' 'Westlake' 'Jamaica Plain' 'Ditmars / Steinway' 'Midtown East' 'Capitol Hill' 'Monterey Park' 'East Boston' 'Westchester/Playa Del Rey' 'Sunset Park' 'Alphabet City' 'Mattapan' 'Cole Valley' 'Charlestown' 'Fenway/Kenmore' 'Gramercy Park' 'South Beach' 'West End' 'The Castro' 'Telegraph Hill' 'Sheepshead Bay' 'Greenwich Village' 'Washington Heights' 'Toluca Lake' 'Bucktown' 'Jamaica' 'Outer Sunset' 'Eastchester' 'Georgetown' '16th Street Heights' 'Little Italy/UIC' 'Morningside Heights' 'South Boston' 'Adams Morgan' 'Cleveland Park' 'Near North Side' 'Roscoe Village' 'Cathedral Heights' 'South Loop/Printers Row' 'Humboldt Park' 'Studio City' 'North End' 'Los Feliz' 'Malibu' 'Ukrainian Village' 'Hyde Park' 'Avondale' 'North Hollywood' 'South Shore' 'Inner Sunset' 'Hermosa Beach' 'Cerritos' 'Eckington' 'Roslindale' 'Lower Haight' 'Prospect Heights' 'South End' 'Burbank' 'Hollywood Hills' 'Arcadia' 'Sherman Oaks' 'Bernal Heights' 'St. Elizabeths' 'Palisades' 'Borough Park' 'Flatiron District' 'Brighton Beach' 'Gowanus' 'Pilsen' 'Manhattan Beach' 'Duboce Triangle' 'West Los Angeles' 'West Hills' 'El Segundo' 'Roxbury' 'Van Nuys' 'Columbia Heights' 'Bloomingdale' 'Park View' 'Midwood' 'Lakeview' 'Logan Circle' 'Pleasant Plains' 'West Adams' 'Brentwood' 'Kensington' 'Arts District' 'Palms' 'Gardena' 'Bayview' 'South Chicago' 'Magnificent Mile' 'Woodridge' 'Haight-Ashbury' 'Redondo Beach' 'Topanga' 'Lawndale' 'Loop' 'Edgewood' 'Dorchester' 'Dogpatch' 'Carson' 'Windsor Terrace' 'Kingsbridge Heights' 'Corona' 'Downtown Brooklyn' 'Irving Park' 'Rogers Park' 'Bronzeville' 'Glen Park' 'South Pasadena' "Fisherman's Wharf" 'SoMa' 'Hermosa' 'Valley Village' 'Woodside' 'Altadena' 'River North' 'Logan Square' 'Judiciary Square' 'Greenwood Heights' 'Allston-Brighton' 'South Street Seaport' 'Manor Park' 'Marina Del Rey' 'Alhambra' 'Anacostia' 'Alamo Square' 'Woodland Hills/Warner Center' 'Foggy Bottom' 'Twin Peaks' 'Andersonville' 'Del Rey' 'Richmond Hill' 'Tarzana' 'Tottenville' 'Truxton Circle' 'Bel Air/Beverly Crest' 'Congress Heights' 'Mount Pleasant' 'Bayside' 'Lynwood' 'Barney Circle' 'Oakland' 'Bridgeport' 'Reseda' 'Fresh Meadows' 'Morris Heights' 'Twining' 'Boerum Hill' 'Monterey Hills' 'Fairlawn' 'Pacific Palisades' 'San Gabriel' 'West Town/Noble Square' 'Harbor Gateway' 'Canarsie' 'Dupont Circle' 'Crotona' 'West Farms' 'Old Town' 'West Loop/Greektown' 'Brookland' 'Uptown' 'Brooklyn' 'Brooklyn Heights' 'Brooklyn Navy Yard' 'Gravesend' 'Concourse Village' 'Woodley Park' 'Ingleside' 'Times Square/Theatre District' 'Parkside' 'Atwater Village' 'Boyle Heights' 'Laurel Canyon' 'Concourse' 'Fordham' 'Red Hook' 'Kalorama' 'Arboretum' 'Roosevelt Island' 'Lomita' 'Temple City' 'Montecito Heights' 'Hayes Valley' 'Crestwood' 'Mission Hill' 'Cow Hollow' 'Sun Valley' 'Lake Balboa' 'Highland Park' 'Queens' 'Mount Washington' 'Petworth' 'Lindenwood' 'Takoma' 'Bensonhurst' 'South Robertson' 'West Ridge' 'Kenwood' 'Tribeca' 'Mission Terrace' 'Hawthorne' 'Potrero Hill' 'Castle Hill ' 'Winnetka' 'Valley Glen' 'East New York' 'San Pedro' 'Inwood' 'Kent' 'Cobble Hill' 'Long Island City' 'Brownsville' 'Little Village' 'Wakefield' 'Buena Vista' 'North Cleveland Park' 'Lincoln Park' 'Kingsbridge' 'El Sereno' 'Park Versailles' 'Crocker Amazon' 'Union Square' 'Michigan Park' 'Morrisania' 'La Crescenta-Montrose' 'Noho' 'Culver City' 'Baldwin Hills' 'Lincoln Heights' 'Bay Ridge' 'Duarte' 'Panorama City' 'Forest Hill' 'Port Morris' 'Sunland/Tujunga' 'Central Northeast/Mahaning Heights' 'Middle Village' 'Van Nest' 'Meatpacking District' 'Burleith' 'Hillbrook' 'Benning Ridge' 'Woodland' 'LeDroit Park' 'Baychester' 'Oceanview' 'West Covina' 'Lakeshore' 'University Heights' 'Maspeth' 'Mott Haven' 'Belmont' 'Deanwood' 'Marina' 'North Beach' 'Tenderloin' 'Civic Center' 'Balboa Terrace' 'New Dorp Beach' 'Norwood Park' 'Skid Row' 'Flatlands' 'Northridge' 'West Portal' 'Glassell Park' 'Albany Park' 'North Hills West' 'West Brighton' 'Meiers Corners' 'Allerton' 'Azusa' 'Downtown Crossing' 'Rosemead' 'Canoga Park' 'Gold Coast' 'Edgewater' 'Takoma Park, MD' 'Mt Rainier/Brentwood, MD' 'North Park' 'Palos Verdes' 'Washington Highlands' 'Glendora' 'Fort Davis' 'Bedford Park' 'Colonial Village' 'Lakewood' 'American University Park' 'South Ozone Park' 'Diamond Heights' 'Battery Park City' 'Lamond Riggs' 'Portage Park' 'Hudson Square' 'Silver Spring, MD' 'Streeterville' 'Navy Yard' 'Co-op City' 'West Puente Valley' 'Rego Park' 'West Roxbury' 'City Island' 'Eagle Rock' 'Bellflower' 'Theater District' 'Downey' 'North Center' 'Riverdale' 'Pleasant Hill' 'Dyker Heights' 'Arleta' 'Cypress Park' 'Woodlawn' 'Excelsior' 'Visitacion Valley' 'Kingman Park' 'Monrovia' 'Great Kills' 'Williamsbridge' 'Norwood' 'Morgan Park' 'Portola' 'River West' 'Near Northeast' 'Mission Bay' 'Armour Square' 'South Whittier' 'Elysian Valley' 'Marine Park' 'Woodhaven' 'Harvard Square' 'The Bronx' 'Tremont' 'Sierra Madre' 'Jefferson Park' 'Dunning' 'East San Gabriel' 'Boystown' 'Presidio Heights' 'Howard Beach' 'La Mirada' 'Soundview' 'Beverly' 'Melrose' 'Carver Langston' 'El Monte' 'Bronxdale' 'Rancho Palos Verdes' 'Ozone Park' 'Bath Beach' 'Garfield Park' 'Little Italy' 'Chatsworth' 'McKinley Park' 'Claremont' 'Naylor Gardens' 'Pico Rivera' 'Douglass' 'Montebello' 'Manhattan' 'Lincoln Square' 'Signal Hill' 'Watts' 'Florence-Graham' 'Brightwood' 'Midland Beach' 'Pelham Bay' 'Sylmar' 'Near West Side' 'Eltingville' 'College Point' 'Wrigleyville' 'Hermon' 'St. George' 'Randall Manor' 'Highbridge' 'Belmont Cragin' 'DUMBO' 'East Los Angeles' 'Kew Garden Hills' 'Santa Fe Springs' 'Huguenot' 'Dupont Park' 'Mount Eden' 'Englewood' 'Covina' 'Bergen Beach' 'New Brighton' 'Hunts Point' 'River Terrace' 'Foxhall' 'Benning' 'La Canada Flintridge' 'Hillcrest' 'Garfield Ridge' 'Stronghold' 'Norwalk' 'Westchester Village' 'San Marino' 'Daly City' 'East Corner' 'Presidio' 'Stapleton' 'Back of the Yards' 'Roseland' 'North Michigan Park' 'West Lawn' 'Ivy City' 'Chestnut Hill' 'Japantown' 'Sea Gate' 'Commerce' 'Throgs Neck' 'Fort Lincoln' 'North Lawndale' 'Austin' 'Wesley Heights' 'Randle Highlands' 'La Puente' 'Good Hope' 'Somerville' 'Winthrop' 'Porter Ranch' 'Longwood' 'Berkley' 'Fort Wadsworth' 'Elm Park' 'Marshall Heights' 'South El Monte' 'Eastland Gardens' 'Vinegar Hill' 'Whitestone' 'Rosebank' 'Langdon' 'Sea Cliff' 'Grymes Hill' 'Montclare' 'Marble Hill' 'Harbor City' 'Bell' 'Spuyten Duyvil' 'Brookline' 'Bellevue' 'Newton' 'Watertown' 'West Athens' 'Huntington Park' "O'Hare" 'Grasmere' 'Pacoima' 'Edenwald' 'South San Gabriel' 'Gerritsen Beach' 'Rossville' 'Greenway' 'Westmont' 'Port Richmond' 'La Habra' 'Leather District' 'Irwindale' 'Rolling Hills' 'Grant City' 'Baldwin Park' 'South Gate' 'Hawaiian Gardens' 'North Hills East' 'Friendship Heights' 'Paramount' 'Bradbury' 'Spring Valley'] Tous les types de host_has_profile_pic : ['t' 'f'] Tous les types de host_identity_verified : ['f' 't'] Tous les types de instant_bookable : ['t' 'f'] Tous les types de zipcode : ['90804' '11385' '20009' '11104' '94131' '10014' '10027' '11355' '90064' '10024' '20002' '90404' '94110' '10016' '60657' '02111' '90026' '10031' '90066' '90405' '10002' '90094' '91316' '11211.0' '90046' '11231.0' '94109' '11221' '91601' '60622' '11249.0' '10019' '10065' '60642' '91105' '20001' '11222' '11370.0' '11378' '11205.0' '11233' '10128' '10301' '90278' '11206.0' '90048' '10026' '10038' '11205' '90019' '90403' '10001.0' '90291' '11225.0' '90015' '11237' '90212' '94108' '10462' '91207' '11216' '11212.0' '94117' '90028' '60608' '90815' '11213.0' '10003.0' '11207' '10012' '11369.0' '10021' '11216.0' '11693' '10018' '02114' '90006' '20008' '20015' '11226' '11225' '11215' '10029.0' '90222' '20007' '90068' '90029' '11238.0' '11377' '91101' '90024' '11106' '20024' '90016' '11103' '90005' '20005' '02116' '10028' '10025' '91302' '91364' '20032' '90039' '11206' '94133' '11237.0' '90601' '60601' '90036' '94118' '91766' '10011.0' '90301' '94103' '91344' '94115' '11224' '10075' '20011' '90021' '02130' '11105' '90065' '90049' '10022' '20003' '90069' '91754' '02128' '90293' '11220' '10001' '10009.0' '02126' '11372' '10010' '10002.0' '10023' '02129' '02215' '60626' '10003' '90604' '10035.0' '94105' '90249' '94114' '11235' '91790' '10011' '10040' '10036' '91602' '11238' '90035' '11422' '94122' '10032' '10469' '60612' '11232' '90014' '02210' '90018' '60610' '10013.0' '60618' '60616' '60647' '91423' '02113' '02108' '90027' '90265' '60637' '11692' '94121' '91605' '60649' '90254' '90703' '11102' '02131' '11101' '02118' '91205' '90023' '10017' '91506' '91007' '20010' '20016' '11204' '90025' '91307' '90245' '90803' '02119' '91405' '10013' '11230' '02127' '91748' '91107' '91745' '11233.0' '60613' '90007' '11211' '11218' '90012' '90034' '10039' '94124' '60617' '90038' '60611' '90277' '20018' '90290' '60625' '90260' '60605' '90502' '60641' '02125' '90247' '10004' '94107' '90746' '11217' '10463' '91010' '11368' '60654' '90808' '90004' '60653' '91301' '90802' '91030' '91502' '90020' '90302' '94123' '90650' '91776' '91006' '02124' '91001' '90814' '02134' '94115.0' '60607' '90813' '90292' '91803' '20020' '11375' '20037' '60640' '11222.0' '11418' '91356' '10307.0' '91403' '90810' '90077' '91767' '11364' '94102' '10037.0' '90262' '60609' '91335' '11365' '10453' '90266' '10033' '91202' '20019' '90013' '11210.0' '11412' '11373.0' '94117.0' '90505' '90032' '11207.0' '91765' '91204' '10027.0' '90272' '90402' '90501' '11236.0' '91604' '11229' '91606' '10459' '91367' '10460' '91436' '10037' '11203.0' '91106' '60661' '20017' '91789' '91201' '02109' '10304' '11201' '10030' '11223' '20012' '10451.0' '02135' '94112' '91321' '94116' '90043' '11379' '90033' '90713' '94127' '10468' '11231' '90062' '90232' '90057' '11432' '11434' '10044' '91504' '90717' '91780' '90031' '10009' '20036' '11370' '94109.0' '90045' '02115' '11691' '91755' '91104' '91352' '91325' '11249' '90042' '91411' '90805' '90303' '11414' '90250' '11236' '60633' '91505' '90210' '60645' '60615' '90503' '11219' '11226.0' '11203' '10473' '91203' '91306' '91401' '90732' '10034' '91801' '90017' '11214' '90220' '11201.0' '11212' '60631' '60623' '10466' '91746' '11435' '60614' '10472' '90008' '10006' '91387' '10456.0' '94130' '10005' '20268' '91214' '90806' '91711' '11210' '91208' '02121' '11209' '94102.0' '91331' '91792' '90061' '10451' '91040' '90230' '91311' '91355' '02120' '11213' '94132' '91042' '02199' '91791' '60602' '10456' '10458' '90504' '10038.0' '91501' '10306' '10035' '11109' '11234' '90056' '91343' '91304' '20006' '91206' '7302.0' '10314.0' '10467' '91702' '91770' '90731' '91607' '60660' '20912' '20712' '90274' '90401' '91354' '91741' '60805' '02110' '91361' '11420' '11368.0' '10280' '20052' '60634' '11208.0' '20910' '10007.0' '10007' '11224.0' '10475.0' '11374' '02132' '10464' '90041' '94111' '90706' '91103' '90242' '94104.0' '94104' '10459.0' '91390' '10014.0' '02136' '91324' '10471.0' '90211' '11228.0' '90280' '93550' '11433' '91768' '90241' '94134' '10475' '91786' '11358' '91016' '90010' '10308' '11416' '10307' '94114.0' '60643' '02122' '94158' '91108' '90605' '90715' '10279' '11234.0' '90807' '11421' '02138' '10704' '10457' '10305' '91024' '10452.0' '60630' '90067' '60651' '91775' '90631' '91802' '10310' '90712' '90638' '10473.0' '91406' '90044' '90403-2638' '93563' '91377' '91731' '90275' '90606' '60624' '91732' '90660' '91303' '90640' '11417' '90037' '1m' '90602' '90755' '90059' '90001' '10461' '91342' '10312' '11356.0' '91708' '11354' '90304' '10282' '91326' '91351' '91773' '10452' '90305' '60659' '90063' '10069' '10018.0' '11361' '11367' '90670' '91733' '10457.0' '60621' '10454' '91722' '90022' '94118.0' '90704' '91723' '11209.0' '10474' '10453.0' '91011' '11411.0' '94014' '11360' '90248' '60638' '90221' '90716' '90034-2203' '60639' '94014.0' '90035-4475' '94129' '60636' '10304.0' '11694' '60619' '11429' '91724' '11220.0' '11423' '20004' '60629' '91381' '11509.0' '93536' '02467' '90040' '10465' '11415' '60606' '60644' '93551' '91744' '11412.0' '11413' '02145' '11001' '10010.0' '60302' '11419' '02152' '91020' '93534' '10455' '91340' '11362.0' '11363.0' '11372.0' '10303' '11411' '60660-1448' '91750' '11221.0' '10471' '90745' '11357' '20816' '11373' '90036-2514' '10026.0' '11429.0' '60603' '10308.0' '94401' '60707' '11228' '91210' '90710' '90201' '91402' '02186' '02445' '10162' '02458' '02472' '90047' '90255' '11426' '11215.0' '60656' '90240' '10004.0' '10305.0' '91384' '90744' '93535' '90003' '10128.0' '93543' '11428' '10309' '10463.0' '10270' '9004' '10282.0' '11239.0' '93552' '91706' '90011' '90723' '91362' '91008' '11436' '10012.0']
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | ... | host_identity_verified | host_response_rate | instant_bookable | latitude | longitude | neighbourhood | number_of_reviews | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5708593 | 4.317488 | 0 | 0 | {TV,"Wireless Internet",Kitchen,"Free parking ... | 3 | 1.0 | 0 | 0 | False | ... | 0 | 100% | 0 | 33.782712 | -118.134410 | 0 | 0 | 0 | 0.0 | 2.0 |
| 1 | 14483613 | 4.007333 | 0 | 0 | {"Wireless Internet","Air conditioning",Kitche... | 4 | 2.0 | 0 | 1 | False | ... | 1 | 100% | 0 | 40.705468 | -73.909439 | 1 | 38 | 1 | 1.0 | 2.0 |
| 2 | 10412649 | 7.090077 | 1 | 1 | {TV,"Wireless Internet","Air conditioning",Kit... | 6 | 2.0 | 0 | 0 | False | ... | 0 | 100% | 0 | 38.917537 | -77.031651 | 2 | 0 | 2 | 2.0 | 2.0 |
| 3 | 17954362 | 3.555348 | 0 | 0 | {TV,"Cable TV",Internet,"Wireless Internet","A... | 1 | 1.0 | 0 | 0 | True | ... | 1 | 100% | 1 | 40.736001 | -73.924248 | 3 | 19 | 3 | 1.0 | 1.0 |
| 4 | 9969781 | 5.480639 | 0 | 1 | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | 0 | 2 | True | ... | 1 | 100% | 1 | 37.744896 | -122.430665 | 4 | 15 | 4 | 2.0 | 2.0 |
5 rows × 22 columns
Pour le fichier de test:
for col in columns:
test_data_numerized = features_transformer.transform(test_data, col)
test_data_numerized.head()
| Unnamed: 0 | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | city | ... | host_identity_verified | host_response_rate | instant_bookable | latitude | longitude | neighbourhood | number_of_reviews | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14282777 | 674 | 674 | {"Wireless Internet","Air conditioning",Kitche... | 3 | 1.0 | 674 | 674 | True | 674 | ... | 674 | 100% | 674 | 40.696524 | -73.991617 | 674 | 2 | 313 | 1.0 | 1.0 |
| 1 | 17029381 | 674 | 674 | {"Wireless Internet","Air conditioning",Kitche... | 7 | 1.0 | 674 | 674 | True | 674 | ... | 674 | 100% | 674 | 40.766115 | -73.989040 | 674 | 6 | 31 | 3.0 | 3.0 |
| 2 | 7824740 | 674 | 674 | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | 674 | 674 | True | 674 | ... | 674 | 100% | 674 | 40.808110 | -73.943756 | 674 | 10 | 6 | 1.0 | 3.0 |
| 3 | 19811650 | 674 | 674 | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | 674 | 674 | True | 674 | ... | 674 | 100% | 674 | 37.772004 | -122.431619 | 674 | 0 | 286 | 2.0 | 2.0 |
| 4 | 12410741 | 674 | 674 | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.0 | 674 | 674 | True | 674 | ... | 674 | 100% | 674 | 38.925627 | -77.034596 | 674 | 4 | 2 | 0.0 | 1.0 |
5 rows × 21 columns
Convertissons maintenant la caractéristique 'cleaning_fee' en integer
train_data['cleaning_fee'] = train_data['cleaning_fee'].astype('int')
test_data['cleaning_fee'] = test_data['cleaning_fee'].astype('int')
Convertissons 'host_response_rate' en float
train_data['host_response_rate'] = train_data['host_response_rate'].str.rstrip('%').astype('float') / 100.0
test_data['host_response_rate'] = test_data['host_response_rate'].str.rstrip('%').astype('float') / 100.0
Traitement de la caractéristique 'amenities' : choix de créer des colonnes pour chaque équipement et de remplir 1 si l'équipement est présent et 0 sinon.
def extract_amenities(amenities_str):
amenities_str = amenities_str.strip('{}') # nous retirons les accolades
amenities_list = amenities_str.split(',') #on sépare les équipements de amenities
amenities_list = [amenity.strip().strip('"').strip("'") for amenity in amenities_list] # nous retirons les guillemets supplémentaires
return amenities_list
all_amenities = set()
for amenity in train_data['amenities']:
all_amenities.update(extract_amenities(amenity))
print("Nombre total d'équipement référencé :", len(all_amenities))
Nombre total d'équipement référencé : 125
Nous faisons de même pour le fichier test et nous remarquons qu'il contient des équipements non référencés dans le fichier d'entrainement !
for amenity in test_data['amenities']:
all_amenities.update(extract_amenities(amenity))
print("Nombre total d'équipement référencé :", len(all_amenities))
Nombre total d'équipement référencé : 131
On crée une colonne dans le fichier d'entrainement et de test. Cette colonne total_amenities contient le nombre total d'amenities par airbnb.
train_data['total_amenities'] = train_data['amenities'].apply(lambda x: len(extract_amenities(x)))
test_data['total_amenities'] = test_data['amenities'].apply(lambda x: len(extract_amenities(x)))
On supprime la colonne amenities dans chacun des deux fichers. Elle n'est plus utile.
train_data.drop(columns=['amenities'], inplace=True)
test_data.drop(columns=['amenities'], inplace=True)
Explication:
Nous avons dans un premier temps décidé de créer des colonnes pour chaque équipement et de remplir 1 si l'équipement est présent et 0 sinon.
Cependant, dû au fait que certains amenities sont présentes dans le fichier de test et non d'entrainement, nous avons finalement décidé de faire simplement la somme des aminities pour chaque airbnb.
Selon nous, cela permet de réduire le bruit et de considérer une seule colonne au lieu de plus de 100 colonnes avec des corrélations allant de NaN (du aux amenities présentes dans un fichier et pas l'autre) et passant par -0.20 jusqu'à 0.10.
Vérification que toutes les colonnes sont conformes: conversion numérique réussie ?
non_numeric_columns = train_data.select_dtypes(include=['object']).columns.tolist()
for col in non_numeric_columns:
print(f"{col}: {type(train_data[col][0])}")
print(train_data[non_numeric_columns].head())
property_type: <class 'int'> room_type: <class 'int'> bed_type: <class 'int'> cancellation_policy: <class 'int'> city: <class 'int'> host_has_profile_pic: <class 'int'> host_identity_verified: <class 'int'> instant_bookable: <class 'int'> neighbourhood: <class 'int'> zipcode: <class 'int'> property_type room_type bed_type cancellation_policy city \ 0 0 0 0 0 0 1 0 0 0 1 1 2 1 1 0 0 2 3 0 0 0 0 1 4 0 1 0 2 3 host_has_profile_pic host_identity_verified instant_bookable neighbourhood \ 0 0 0 0 0 1 0 1 0 1 2 0 0 0 2 3 0 1 1 3 4 0 1 1 4 zipcode 0 0 1 1 2 2 3 3 4 4
Tout est bon. Passons aux visualisations.
Visualisations¶
Visualisation de la distribution des prix
plt.figure(figsize=(10, 6))
sns.histplot(train_data['log_price'], kde=True)
plt.title('Distribution de log_price')
plt.xlabel('log_price')
plt.ylabel('Fréquence')
plt.show()
Procédons aux calculs des corrélations entre les caractéristiques.
correlation_matrix = train_data.corr()
Heatmap des corrélations:
sns.heatmap(correlation_matrix)
<Axes: >
Matrice de corrélation plus esthétique des corrélations:
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Matrice de corrélation")
plt.show()
Corrélations du logarithme du prix avec les autres colonnes et sélection des caractéristiques les plus corrélées avec le logarithme du prix.
correlation_with_log_price = correlation_matrix['log_price'].sort_values(ascending=False)
print(correlation_with_log_price)
log_price 1.000000 accommodates 0.564874 bedrooms 0.472728 beds 0.435068 room_type 0.406694 bathrooms 0.368856 total_amenities 0.198436 cleaning_fee 0.115116 city 0.104990 instant_bookable 0.040482 host_identity_verified 0.030285 cancellation_policy 0.026986 property_type 0.018080 host_has_profile_pic 0.013117 host_response_rate 0.005498 latitude 0.005231 id -0.002122 neighbourhood -0.032620 number_of_reviews -0.039138 longitude -0.045348 bed_type -0.097958 zipcode -0.115141 Name: log_price, dtype: float64
Visualisation des corrélations des caractéristiques avec le logarithme du prix
plt.figure(figsize=(12, 8))
correlation_with_log_price = correlation_matrix['log_price'].sort_values(ascending=False)
sns.barplot(x=correlation_with_log_price.index, y=correlation_with_log_price.values)
plt.title('Corrélations des caractéristiques avec log_price')
plt.xticks(rotation=90)
plt.ylabel('Coefficient de corrélation')
plt.show()
Visualisation de la corrélation entre la caractéristique 'accomodates' (la plus corrélée avec 'log_price') avec le logarithme du prix
plt.scatter(train_data['accommodates'],train_data['log_price'])
plt.xlabel("Accomodates")
plt.ylabel("Log du prix en euros")
Text(0, 0.5, 'Log du prix en euros')
Représentation de 3 visualisation différentes:
- Visualisation du logarithme du prix par type de propriété
- Visualisation du logarithme du prix par ville
- Visualisation du logarithme du prix vs le total des amenities
fig, axes = plt.subplots(1, 3, figsize=(18, 6)) # on veut représenter les 3 visualisations côte à côte
# Visualisation du logarithme du prix par type de propriété
sns.boxplot(x='property_type', y='log_price', data=train_data, ax=axes[0])
axes[0].set_title('log_price par type de propriété')
axes[0].set_xticklabels(train_data['property_type'].unique(), rotation=90)
# 2. Visualisation du logarithme du prix par ville
sns.boxplot(x='city', y='log_price', data=train_data, ax=axes[1])
axes[1].set_title('log_price par ville')
axes[1].set_xticklabels(train_data['city'].unique(), rotation=90)
# 3. Visualisation du logarithme du prix vs le total des amenities
sns.scatterplot(x='total_amenities', y='log_price', data=train_data, alpha=0.5, ax=axes[2])
axes[2].set_title('log_price vs total_amenities')
axes[2].set_xlabel('Total des équipements')
axes[2].set_ylabel('log_price')
plt.tight_layout()
plt.show()
C:\Users\markz\AppData\Local\Temp\ipykernel_9924\3587587781.py:6: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. axes[0].set_xticklabels(train_data['property_type'].unique(), rotation=90) C:\Users\markz\AppData\Local\Temp\ipykernel_9924\3587587781.py:11: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. axes[1].set_xticklabels(train_data['city'].unique(), rotation=90)
Carte montrant la répartition des airbnb
import folium
import pandas as pd
# la carte est basée sur la moyenne des latitudes et longitudes
center_lat = train_data['latitude'].mean()
center_long = train_data['longitude'].mean()
m = folium.Map(location=[center_lat, center_long], zoom_start=12)
# on ajoute un point sur la carte pour chaque airbnb
for idx, row in train_data.iterrows():
folium.Marker(
location=[row['latitude'], row['longitude']],
popup=f"Price: {row['log_price']}"
).add_to(m)
m.save("airbnb_map.html") #fichier html de la carte
m